import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor
(3376, 46)
# Report how many cells of Data are missing, next to the total number of cells
# (missing cells plus non-missing cells).
missing_cells = Data.isnull().sum().sum()
total_cells = missing_cells + Data.notna().sum().sum()
print(missing_cells," mssing number out of ",total_cells)
19952 mssing number out of 155296
# Share of missing cells in Data, expressed as a percentage and rounded to the
# nearest whole number.
n_missing = Data.isnull().sum().sum()
n_present = Data.notna().sum().sum()
missing_pct = np.round((n_missing * 100) / (n_missing + n_present))
print(missing_pct, "% of missing data")
13.0 % of missing data
OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | Address | City | State | ZipCode | TaxParcelIdentificationNumber | ... | Electricity(kWh) | Electricity(kBtu) | NaturalGas(therms) | NaturalGas(kBtu) | DefaultData | Comments | ComplianceStatus | Outlier | TotalGHGEmissions | GHGEmissionsIntensity | |
0 | 1 | 2016 | NonResidential | Hotel | Mayflower park hotel | 405 Olive way | Seattle | WA | 98101.0 | 0659000030 | ... | 1.156514e+06 | 3946027.0 | 12764.52930 | 1276453.0 | False | NaN | Compliant | NaN | 249.98 | 2.83 |
1 | 2 | 2016 | NonResidential | Hotel | Paramount Hotel | 724 Pine street | Seattle | WA | 98101.0 | 0659000220 | ... | 9.504252e+05 | 3242851.0 | 51450.81641 | 5145082.0 | False | NaN | Compliant | NaN | 295.86 | 2.86 |
2 | 3 | 2016 | NonResidential | Hotel | 5673-The Westin Seattle | 1900 5th Avenue | Seattle | WA | 98101.0 | 0659000475 | ... | 1.451544e+07 | 49526664.0 | 14938.00000 | 1493800.0 | False | NaN | Compliant | NaN | 2089.28 | 2.19 |
3 rows × 46 columns
OSEBuildingID | DataYear | ZipCode | CouncilDistrictCode | Latitude | Longitude | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | ... | SiteEnergyUseWN(kBtu) | SteamUse(kBtu) | Electricity(kWh) | Electricity(kBtu) | NaturalGas(therms) | NaturalGas(kBtu) | DefaultData | Comments | TotalGHGEmissions | GHGEmissionsIntensity | |
count | 3376.000000 | 3376.0 | 3360.000000 | 3376.000000 | 3376.000000 | 3376.000000 | 3376.000000 | 3368.000000 | 3376.000000 | 3.376000e+03 | ... | 3.370000e+03 | 3.367000e+03 | 3.367000e+03 | 3.367000e+03 | 3.367000e+03 | 3.367000e+03 | 3376 | 0.0 | 3367.000000 | 3367.000000 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2 | NaN | NaN | NaN |
top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | NaN |
freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 3263 | NaN | NaN | NaN |
mean | 21208.991114 | 2016.0 | 98116.949107 | 4.439277 | 47.624033 | -122.334795 | 1968.573164 | 1.106888 | 4.709123 | 9.483354e+04 | ... | 5.276726e+06 | 2.745959e+05 | 1.086639e+06 | 3.707612e+06 | 1.368505e+04 | 1.368505e+06 | NaN | NaN | 119.723971 | 1.175916 |
std | 12223.757015 | 0.0 | 18.615205 | 2.120625 | 0.047758 | 0.027203 | 33.088156 | 2.108402 | 5.494465 | 2.188376e+05 | ... | 1.593879e+07 | 3.912173e+06 | 4.352478e+06 | 1.485066e+07 | 6.709781e+04 | 6.709781e+06 | NaN | NaN | 538.832227 | 1.821452 |
min | 1.000000 | 2016.0 | 98006.000000 | 1.000000 | 47.499170 | -122.414250 | 1900.000000 | 0.000000 | 0.000000 | 1.128500e+04 | ... | 0.000000e+00 | 0.000000e+00 | -3.382680e+04 | -1.154170e+05 | 0.000000e+00 | 0.000000e+00 | NaN | NaN | -0.800000 | -0.020000 |
25% | 19990.750000 | 2016.0 | 98105.000000 | 3.000000 | 47.599860 | -122.350662 | 1948.000000 | 1.000000 | 2.000000 | 2.848700e+04 | ... | 9.701822e+05 | 0.000000e+00 | 1.874229e+05 | 6.394870e+05 | 0.000000e+00 | 0.000000e+00 | NaN | NaN | 9.495000 | 0.210000 |
50% | 23112.000000 | 2016.0 | 98115.000000 | 4.000000 | 47.618675 | -122.332495 | 1975.000000 | 1.000000 | 4.000000 | 4.417500e+04 | ... | 1.904452e+06 | 0.000000e+00 | 3.451299e+05 | 1.177583e+06 | 3.237538e+03 | 3.237540e+05 | NaN | NaN | 33.920000 | 0.610000 |
75% | 25994.250000 | 2016.0 | 98122.000000 | 7.000000 | 47.657115 | -122.319407 | 1997.000000 | 1.000000 | 5.000000 | 9.099200e+04 | ... | 4.381429e+06 | 0.000000e+00 | 8.293178e+05 | 2.829632e+06 | 1.189033e+04 | 1.189034e+06 | NaN | NaN | 93.940000 | 1.370000 |
max | 50226.000000 | 2016.0 | 98272.000000 | 7.000000 | 47.733870 | -122.220966 | 2015.000000 | 111.000000 | 99.000000 | 9.320156e+06 | ... | 4.716139e+08 | 1.349435e+08 | 1.925775e+08 | 6.570744e+08 | 2.979090e+06 | 2.979090e+08 | NaN | NaN | 16870.980000 | 34.090000 |
11 rows × 31 columns
['OSEBuildingID', 'DataYear', 'ZipCode', 'CouncilDistrictCode', 'Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseTypeGFA', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)', 'DefaultData', 'Comments', 'TotalGHGEmissions', 'GHGEmissionsIntensity'] 31
['BuildingType', 'PrimaryPropertyType', 'PropertyName', 'Address', 'City', 'State', 'TaxParcelIdentificationNumber', 'Neighborhood', 'ListOfAllPropertyUseTypes', 'LargestPropertyUseType', 'SecondLargestPropertyUseType', 'ThirdLargestPropertyUseType', 'YearsENERGYSTARCertified', 'ComplianceStatus', 'Outlier'] 15
array(['NonResidential', 'Nonresidential COS', 'Multifamily MR (5-9)', 'SPS-District K-12', 'Campus', 'Multifamily LR (1-4)', 'Multifamily HR (10+)', 'Nonresidential WA'], dtype=object)
# Number of rows whose BuildingType is "NonResidential".
is_non_residential = Data['BuildingType'].isin(["NonResidential"])
print("NonResidential:", len(Data[is_non_residential]))
NonResidential: 1460
# Number of rows whose BuildingType is "Nonresidential COS".
is_non_residential_cos = Data['BuildingType'].isin(["Nonresidential COS"])
print("Nonresidential COS:", len(Data[is_non_residential_cos]))
Nonresidential COS: 85
# Number of rows whose BuildingType is "Nonresidential WA".
is_non_residential_wa = Data['BuildingType'].isin(["Nonresidential WA"])
print("Nonresidential WA:", len(Data[is_non_residential_wa]))
Nonresidential WA: 1
# Keep only the three non-residential building types. The result is an
# independent copy, so later modifications do not affect Data.
non_residential_types = ["NonResidential", "Nonresidential COS", "Nonresidential WA"]
DataNR = Data.loc[Data['BuildingType'].isin(non_residential_types)].copy()
['OSEBuildingID', 'DataYear', 'ZipCode', 'CouncilDistrictCode', 'Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseTypeGFA', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)', 'DefaultData', 'Comments', 'TotalGHGEmissions', 'GHGEmissionsIntensity']
SiteEnergyUse(kBtu): The annual amount of energy consumed by the property from all sources of energy.
TotalGHGEmissions: The total amount of greenhouse gas emissions, including carbon dioxide, methane, and nitrous oxide gases, released into the atmosphere as a result of energy consumption at the property.
array(['Hotel', 'Other', 'Mixed Use Property', 'University', 'Small- and Mid-Sized Office', 'Self-Storage Facility', 'Warehouse', 'K-12 School', 'Large Office', 'Senior Care Community', 'Medical Office', 'Retail Store', 'Hospital', 'Residence Hall', 'Distribution Center', 'Worship Facility', 'Supermarket / Grocery Store', 'Laboratory', 'Refrigerated Warehouse', 'Restaurant', 'Low-Rise Multifamily', 'Office'], dtype=object)
array([ 1., 3., 0., 2., 4., 6., 9., 5., nan, 7., 8.])
array([ 88434, 103566, 956110, ..., 13157, 14101, 18258])
# Working table for the modelling steps: identifier, candidate features and the
# two targets (TotalGHGEmissions and SiteEnergyUse(kBtu)), copied out of DataNR.
selected_columns = [
    'OSEBuildingID',
    'PrimaryPropertyType',
    'Latitude',
    'Longitude',
    'Neighborhood',
    'YearBuilt',
    'NumberofBuildings',
    'NumberofFloors',
    'TotalGHGEmissions',
    'SiteEnergyUse(kBtu)',
    'ENERGYSTARScore',
]
DataNRCO2E = DataNR[selected_columns].copy()
OSEBuildingID | PrimaryPropertyType | Latitude | Longitude | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | TotalGHGEmissions | SiteEnergyUse(kBtu) | ENERGYSTARScore | BuildingAge | |
0 | 1 | Hotel | 47.61220 | -122.33799 | DOWNTOWN | 1927 | 1.0 | 12 | 249.98 | 7.226362e+06 | 60.0 | 89 |
1 | 2 | Hotel | 47.61317 | -122.33393 | DOWNTOWN | 1996 | 1.0 | 11 | 295.86 | 8.387933e+06 | 61.0 | 20 |
2 | 3 | Hotel | 47.61393 | -122.33810 | DOWNTOWN | 1969 | 1.0 | 41 | 2089.28 | 7.258702e+07 | 43.0 | 47 |
3 | 5 | Hotel | 47.61412 | -122.33664 | DOWNTOWN | 1926 | 1.0 | 10 | 286.43 | 6.794584e+06 | 56.0 | 90 |
4 | 8 | Hotel | 47.61375 | -122.34047 | DOWNTOWN | 1980 | 1.0 | 18 | 505.01 | 1.417261e+07 | 75.0 | 36 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3371 | 50222 | Office | 47.56722 | -122.31154 | GREATER DUWAMISH | 1990 | 1.0 | 1 | 20.94 | 8.497457e+05 | 46.0 | 26 |
3372 | 50223 | Other | 47.59625 | -122.32283 | DOWNTOWN | 2004 | 1.0 | 1 | 32.17 | 9.502762e+05 | NaN | 12 |
3373 | 50224 | Other | 47.63644 | -122.35784 | MAGNOLIA / QUEEN ANNE | 1974 | 1.0 | 1 | 223.54 | 5.765898e+06 | NaN | 42 |
3374 | 50225 | Mixed Use Property | 47.52832 | -122.32431 | GREATER DUWAMISH | 1989 | 1.0 | 1 | 22.11 | 7.194712e+05 | NaN | 27 |
3375 | 50226 | Mixed Use Property | 47.53939 | -122.29536 | GREATER DUWAMISH | 1938 | 1.0 | 1 | 41.27 | 1.152896e+06 | NaN | 78 |
1546 rows × 12 columns
OSEBuildingID | PrimaryPropertyType | Latitude | Longitude | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | TotalGHGEmissions | SiteEnergyUse(kBtu) | ENERGYSTARScore | BuildingAge | |
0 | 1 | Hotel | 47.61220 | -122.33799 | DOWNTOWN | 1927 | 1.0 | 12 | 249.98 | 7.226362e+06 | 60.0 | 89 |
1 | 2 | Hotel | 47.61317 | -122.33393 | DOWNTOWN | 1996 | 1.0 | 11 | 295.86 | 8.387933e+06 | 61.0 | 20 |
2 | 3 | Hotel | 47.61393 | -122.33810 | DOWNTOWN | 1969 | 1.0 | 41 | 2089.28 | 7.258702e+07 | 43.0 | 47 |
3 | 5 | Hotel | 47.61412 | -122.33664 | DOWNTOWN | 1926 | 1.0 | 10 | 286.43 | 6.794584e+06 | 56.0 | 90 |
4 | 8 | Hotel | 47.61375 | -122.34047 | DOWNTOWN | 1980 | 1.0 | 18 | 505.01 | 1.417261e+07 | 75.0 | 36 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3339 | 50069 | Small- and Mid-Sized Office | 47.53161 | -122.29944 | GREATER DUWAMISH | 1929 | 1.0 | 2 | 134.80 | 4.420650e+06 | 9.0 | 87 |
3347 | 50081 | K-12 School | 47.58831 | -122.30650 | GREATER DUWAMISH | 2015 | 1.0 | 3 | 9.24 | 1.325973e+06 | 77.0 | 1 |
3366 | 50210 | Office | 47.63572 | -122.37525 | MAGNOLIA / QUEEN ANNE | 1952 | 1.0 | 1 | 3.50 | 5.026677e+05 | 75.0 | 64 |
3369 | 50220 | Office | 47.56440 | -122.27813 | SOUTHEAST | 1960 | 1.0 | 1 | 7.79 | 3.878100e+05 | 93.0 | 56 |
3371 | 50222 | Office | 47.56722 | -122.31154 | GREATER DUWAMISH | 1990 | 1.0 | 1 | 20.94 | 8.497457e+05 | 46.0 | 26 |
1006 rows × 12 columns
We have categorical features, so we need to transform them with a one-hot encoder.
PrimaryPropertyType | Neighborhood | |
count | 1006 | 1006 |
unique | 18 | 18 |
top | Small- and Mid-Sized Office | DOWNTOWN |
freq | 238 | 249 |
# Remove, in place, every row whose PrimaryPropertyType is one of the two
# excluded property types.
excluded_types = ['Low-Rise Multifamily', 'Residence Hall']
rows_to_drop = DataNRCO2E.index[DataNRCO2E["PrimaryPropertyType"].isin(excluded_types)]
DataNRCO2E.drop(rows_to_drop, axis=0, inplace=True)
<ipython-input-27-3b9fa3907873>:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. DataNRCO2E.skew()
OSEBuildingID 0.345459 Latitude 0.247815 Longitude 0.011083 YearBuilt -0.394848 NumberofBuildings 10.418149 NumberofFloors 4.874361 TotalGHGEmissions 12.471995 SiteEnergyUse(kBtu) 8.861597 ENERGYSTARScore -0.675025 BuildingAge 0.394848 dtype: float64
# Draw semi-transparent histograms of the DataNRCO2E columns on a 20x10 figure.
DataNRCO2E.hist(figsize=(20, 10), alpha=0.5)
SiteEnergyUse(kBtu) after log transformation
/usr/local/lib/python3.8/dist-packages/seaborn/ FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='SiteEnergyUse(kBtu)', ylabel='Density'>
TotalGHGEmissions after log transformation
/usr/local/lib/python3.8/dist-packages/seaborn/ FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='TotalGHGEmissions', ylabel='Density'>
from numpy import mean, std, absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
# Features are every column except the identifier and the two targets;
# the target for this section is TotalGHGEmissions.
non_feature_columns = ["OSEBuildingID", "TotalGHGEmissions", "SiteEnergyUse(kBtu)"]
X = DataNRCO2E.drop(columns=non_feature_columns)
y = DataNRCO2E["TotalGHGEmissions"]
print(X.shape, y.shape)
(986, 9) (986,)
# determine categorical and numerical features
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
# define the data preparation for the columns:
#   - categorical columns are one-hot encoded,
#   - numerical columns are standardised.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# that was not seen during fit. Without it, a rare category that ends up only in
# a validation fold makes cross-validation raise instead of scoring the fold.
t = [
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix),
    ('num', StandardScaler(), numerical_ix.values),
]
col_transform = ColumnTransformer(transformers=t)
ColumnTransformer(transformers=[('cat', OneHotEncoder(), Index(['PrimaryPropertyType', 'Neighborhood'], dtype='object')), ('num', StandardScaler(), array(['Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'ENERGYSTARScore', 'BuildingAge'], dtype=object))])
# Candidate regressors: a mean-predicting baseline, ordinary least squares,
# an RBF-kernel SVR and a default XGBoost regressor.
modelDummyReg = DummyRegressor(strategy="mean")
modelLR = LinearRegression()
modelSVR = SVR(C=100, kernel='rbf', gamma='scale')
modelXGB_T1 = XGBRegressor()
# Wrap every model in a pipeline that applies the shared column preparation
# ('prep') before the estimator ('m').
pipelineDummyReg, pipelineLR, pipelineSVR, pipelineXGB = (
    Pipeline(steps=[('prep', col_transform), ('m', estimator)])
    for estimator in (modelDummyReg, modelLR, modelSVR, modelXGB_T1)
)
# Cross-validation scheme shared by all evaluations: 10 shuffled folds with a
# fixed seed so that every model is scored on the same splits.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Cross-validated MAE of the dummy baseline. The scorer reports negated errors,
# so absolute() turns them back into positive MAE values.
scoresDummyReg = absolute(
    cross_val_score(
        pipelineDummyReg, X, y,
        cv=cv,
        scoring='neg_mean_absolute_error',
        error_score="raise",
        n_jobs=-1,
    )
)
# Mean and spread of the per-fold MAE.
dummy_mae_mean, dummy_mae_std = mean(scoresDummyReg), std(scoresDummyReg)
print('DummyReg MAE: %.3f (%.3f)' % (dummy_mae_mean, dummy_mae_std))
DummyReg MAE: 1.151 (0.091)
# Cross-validated MAE of the linear regression pipeline. The scorer reports
# negated errors, so absolute() turns them back into positive MAE values.
scoresLR = absolute(
    cross_val_score(
        pipelineLR, X, y,
        cv=cv,
        scoring='neg_mean_absolute_error',
        error_score="raise",
        n_jobs=-1,
    )
)
# Mean and spread of the per-fold MAE.
lr_mae_mean, lr_mae_std = mean(scoresLR), std(scoresLR)
print('LinearRegression MAE: %.3f (%.3f)' % (lr_mae_mean, lr_mae_std))
LinearRegression MAE: 0.841 (0.066)
# Cross-validated MAE of the SVR pipeline. The scorer reports negated errors,
# so absolute() turns them back into positive MAE values.
scoresSVR = absolute(
    cross_val_score(
        pipelineSVR, X, y,
        cv=cv,
        scoring='neg_mean_absolute_error',
        error_score="raise",
        n_jobs=-1,
    )
)
# Mean and spread of the per-fold MAE.
svr_mae_mean, svr_mae_std = mean(scoresSVR), std(scoresSVR)
print('SVR MAE: %.3f (%.3f)' % (svr_mae_mean, svr_mae_std))
SVR MAE: 1.005 (0.062)
# Cross-validated MAE of the XGBoost pipeline. The scorer reports negated
# errors, so absolute() turns them back into positive MAE values.
scoresXGB = absolute(
    cross_val_score(
        pipelineXGB, X, y,
        cv=cv,
        scoring='neg_mean_absolute_error',
        error_score="raise",
        n_jobs=-1,
    )
)
# Mean and spread of the per-fold MAE.
xgb_mae_mean, xgb_mae_std = mean(scoresXGB), std(scoresXGB)
print('XGB MAE: %.3f (%.3f)' % (xgb_mae_mean, xgb_mae_std))
XGB MAE: 0.806 (0.088)
# Cross-validated R2 of the dummy baseline on the same folds.
scoresDummyReg = cross_val_score(
    pipelineDummyReg, X, y,
    cv=cv,
    scoring='r2',
    error_score="raise",
    n_jobs=-1,
)
# Mean and spread of the per-fold R2.
dummy_r2_mean, dummy_r2_std = mean(scoresDummyReg), std(scoresDummyReg)
print('DummyReg r2: %.3f (%.3f)' % (dummy_r2_mean, dummy_r2_std))
DummyReg r2: -0.010 (0.010)
# Cross-validated R2 of the SVR pipeline on the same folds.
scoresSVR = cross_val_score(
    pipelineSVR, X, y,
    cv=cv,
    scoring='r2',
    error_score="raise",
    n_jobs=-1,
)
# Mean and spread of the per-fold R2.
svr_r2_mean, svr_r2_std = mean(scoresSVR), std(scoresSVR)
print('SVR r2: %.3f (%.3f)' % (svr_r2_mean, svr_r2_std))
SVR r2: 0.158 (0.128)
# Cross-validated R2 of the XGBoost pipeline on the same folds.
scoresXGB = cross_val_score(
    pipelineXGB, X, y,
    cv=cv,
    scoring='r2',
    error_score="raise",
    n_jobs=-1,
)
# Mean and spread of the per-fold R2.
xgb_r2_mean, xgb_r2_std = mean(scoresXGB), std(scoresXGB)
print('XGB r2: %.3f (%.3f)' % (xgb_r2_mean, xgb_r2_std))
XGB r2: 0.468 (0.082)
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
# One-hot encode the two categorical columns and keep every other column as is.
# NOTE(review): the closing of this call was missing from the file.
# remainder='passthrough' is reconstructed from the 'remainder__*' feature names
# shown in the recorded feature table -- confirm.
transformer = make_column_transformer(
    (OneHotEncoder(), ['PrimaryPropertyType', 'Neighborhood']),
    remainder='passthrough',
)
# Densify the encoded matrix and label its columns with the generated feature
# names so that feature importances can be mapped back to readable names.
transformed = transformer.fit_transform(X).toarray()
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Hold out 30% of the rows as a test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_df, y, test_size=0.3, random_state=42
)
# NOTE(review): the split line used to end with a stray ",y_train)" fragment,
# which looks like the tail of a lost "<model>.fit(X_train,y_train)" call.
# Fitting modelXGB_T1 is the presumed intent -- confirm against the notebook.
modelXGB_T1.fit(X_train, y_train)
[09:32:27] WARNING: /workspace/src/objective/ reg:linear is now deprecated in favor of reg:squarederror.
ypred | TotalGHGEmissions | |
0 | 319.701172 | 249.98 |
1 | 304.534607 | 295.86 |
2 | 990.298706 | 2089.28 |
3 | 319.701172 | 286.43 |
4 | 342.030884 | 505.01 |
... | ... | ... |
981 | 26.405626 | 134.80 |
982 | 24.814886 | 9.24 |
983 | 28.213591 | 3.50 |
984 | 22.537758 | 7.79 |
985 | 30.965725 | 20.94 |
986 rows × 2 columns
0 | |
38 | 0.152405 |
12 | 0.123078 |
18 | 0.082017 |
13 | 0.072722 |
14 | 0.053095 |
39 | 0.043498 |
11 | 0.031702 |
4 | 0.031545 |
8 | 0.031119 |
0 | 0.028707 |
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | |
0 | onehotencoder__PrimaryPropertyType_Distributio... | onehotencoder__PrimaryPropertyType_Hospital | onehotencoder__PrimaryPropertyType_Hotel | onehotencoder__PrimaryPropertyType_K-12 School | onehotencoder__PrimaryPropertyType_Large Office | onehotencoder__PrimaryPropertyType_Medical Office | onehotencoder__PrimaryPropertyType_Mixed Use P... | onehotencoder__PrimaryPropertyType_Office | onehotencoder__PrimaryPropertyType_Other | onehotencoder__PrimaryPropertyType_Refrigerate... | ... | onehotencoder__Neighborhood_Northwest | onehotencoder__Neighborhood_SOUTHEAST | onehotencoder__Neighborhood_SOUTHWEST | remainder__Latitude | remainder__Longitude | remainder__YearBuilt | remainder__NumberofBuildings | remainder__NumberofFloors | remainder__ENERGYSTARScore | remainder__BuildingAge |
1 rows × 41 columns
ENERGYSTARScore ranks only sixth among the top ten features by importance, so it is not really useful.
# Same feature set as before (identifier and both targets removed), now with
# SiteEnergyUse(kBtu) as the target.
non_feature_columns = ["OSEBuildingID", "TotalGHGEmissions", "SiteEnergyUse(kBtu)"]
X = DataNRCO2E.drop(columns=non_feature_columns)
y = DataNRCO2E["SiteEnergyUse(kBtu)"]
# determine categorical and numerical features
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
# define the data preparation for the columns:
#   - categorical columns are one-hot encoded,
#   - numerical columns are standardised.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# that was not seen during fit. Without it, a rare category that ends up only in
# a validation fold makes cross-validation raise instead of scoring the fold.
t = [
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix),
    ('num', StandardScaler(), numerical_ix.values),
]
col_transform = ColumnTransformer(transformers=t)
# Candidate regressors: a mean-predicting baseline, ordinary least squares,
# an RBF-kernel SVR and a default XGBoost regressor.
modelDummyReg = DummyRegressor(strategy="mean")
modelLR = LinearRegression()
modelSVR = SVR(C=100, kernel='rbf', gamma='scale')
modelXGB_T2 = XGBRegressor()
# Wrap every model in a pipeline that applies the shared column preparation
# ('prep') before the estimator ('m').
pipelineDummyReg, pipelineLR, pipelineSVR, pipelineXGB = (
    Pipeline(steps=[('prep', col_transform), ('m', estimator)])
    for estimator in (modelDummyReg, modelLR, modelSVR, modelXGB_T2)
)
# Cross-validation scheme shared by all evaluations: 10 shuffled folds with a
# fixed seed so that every model is scored on the same splits.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Cross-validated MAE for every pipeline, using identical settings. The scorer
# reports negated errors, so absolute() turns them back into positive values.
mae_cv_options = dict(
    scoring='neg_mean_absolute_error',
    error_score="raise",
    cv=cv,
    n_jobs=-1,
)

# Dummy baseline.
scoresDummyReg = absolute(cross_val_score(pipelineDummyReg, X, y, **mae_cv_options))
print('DummyReg MAE: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))

# Linear regression.
scoresLR = absolute(cross_val_score(pipelineLR, X, y, **mae_cv_options))
print('LinearRegression MAE: %.3f (%.3f)' % (mean(scoresLR), std(scoresLR)))

# Support vector regression.
scoresSVR = absolute(cross_val_score(pipelineSVR, X, y, **mae_cv_options))
print('SVR MAE: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))

# XGBoost.
scoresXGB = absolute(cross_val_score(pipelineXGB, X, y, **mae_cv_options))
print('XGB MAE: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))
DummyReg MAE: 1.066 (0.097) LinearRegression MAE: 0.611 (0.038) SVR MAE: 0.712 (0.070) XGB MAE: 0.577 (0.049)
# Cross-validated R2 for the baseline, SVR and XGBoost pipelines, using
# identical settings on the same folds.
r2_cv_options = dict(
    scoring='r2',
    error_score="raise",
    cv=cv,
    n_jobs=-1,
)

# Dummy baseline.
scoresDummyReg = cross_val_score(pipelineDummyReg, X, y, **r2_cv_options)
print('DummyReg r2: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))

# Support vector regression.
scoresSVR = cross_val_score(pipelineSVR, X, y, **r2_cv_options)
print('SVR r2: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))

# XGBoost.
scoresXGB = cross_val_score(pipelineXGB, X, y, **r2_cv_options)
print('XGB r2: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))
DummyReg r2: -0.011 (0.012) SVR r2: 0.454 (0.119) XGB r2: 0.648 (0.074)
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
# One-hot encode the two categorical columns and keep every other column as is.
# NOTE(review): the closing of this call was missing from the file.
# remainder='passthrough' is reconstructed from the 'remainder__*' feature names
# shown in the recorded feature table -- confirm.
transformer = make_column_transformer(
    (OneHotEncoder(), ['PrimaryPropertyType', 'Neighborhood']),
    remainder='passthrough',
)
# Densify the encoded matrix and label its columns with the generated feature
# names so that feature importances can be mapped back to readable names.
transformed = transformer.fit_transform(X).toarray()
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Hold out 30% of the rows as a test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_df, y, test_size=0.3, random_state=42
)
# NOTE(review): the split line used to end with a stray ",y_train)" fragment,
# which looks like the tail of a lost "<model>.fit(X_train,y_train)" call.
# Fitting modelXGB_T2 is the presumed intent -- confirm against the notebook.
modelXGB_T2.fit(X_train, y_train)
[09:32:45] WARNING: /workspace/src/objective/ reg:linear is now deprecated in favor of reg:squarederror.
ypred | SiteEnergyUse(kBtu) | |
0 | 9.411165e+06 | 7.226363e+06 |
1 | 1.160892e+07 | 8.387933e+06 |
2 | 3.726919e+07 | 7.258702e+07 |
3 | 9.016754e+06 | 6.794584e+06 |
4 | 1.540815e+07 | 1.417261e+07 |
... | ... | ... |
981 | 1.798050e+06 | 4.420650e+06 |
982 | 2.200798e+06 | 1.325973e+06 |
983 | 1.222920e+06 | 5.026677e+05 |
984 | 7.690450e+05 | 3.878100e+05 |
985 | 1.485874e+06 | 8.497457e+05 |
986 rows × 2 columns